# Clear all objects (from the workspace)
rm(list = ls())
# Suppress Warning messages
options(warn = -1)
# Turn off scientific notation like 1e+06
# options(scipen=999)
options(stringsAsFactors = F)
# Load Libs
# # INSTALL with:
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("DESeq2")
library(tidyr)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
# LOAD provided functions
source("./script_ejercicios.R")
# LOAD data/matrixes
count_mat = read.table("./count_mat.txt", header = TRUE, sep = "\t")
FPKM_mat = read.table("./FPKM_mat.txt", header = TRUE, sep = "\t")
# Let's take a LOOK
count_mat[10000:10010, ]
FPKM_mat[10000:10010, ]
# FORMAT row names
count_mat = count_mat %>% unite("rowNames", geneId:geneName, remove = TRUE)
FPKM_mat = FPKM_mat %>% unite("rowNames", geneId:geneName, remove = TRUE)
row.names(count_mat) = count_mat[, 1]
row.names(FPKM_mat) = FPKM_mat[, 1]
# DROP extra data
drops <- c("rowNames", "type", "specific_type")
count_mat = count_mat[, !(names(count_mat) %in% drops)]
FPKM_mat = FPKM_mat[, !(names(FPKM_mat) %in% drops)]
#count_mat["geneId"]
# gsub(” “, “_”,(trimws(gsub(“\\_+”, ” “, stringVec3))))
# row.names(count_mat) <- count_mat[]
# Let's take a LOOK
count_mat[10000:10010, ]
FPKM_mat[10000:10010, ]
# FILTRAR genes - conteos
# Para cada gen, contar el número de muestras con mayor a 5 conteos
count_mat_filter <-
apply(count_mat, 1, function(x)
length(which(x >= 5)))
table(count_mat_filter)
## count_mat_filter
## 0 1 2 3 4 5 6 7 8 9
## 19546 882 681 727 474 447 562 640 686 11226
# Filtrar genes con menor a 2 muestras con más de 5 conteos
count_mat <- count_mat[which(count_mat_filter >= 2),]
# Let's take a LOOK
dim(count_mat)
## [1] 15443 9
count_mat[10000:10010, ]
# FILTRAR genes - FPKM
# DO log base 2
FPKM_mat <- glog(FPKM_mat)
#Para cada gen, contar el número de muestras con expresión mayor a 2
FPKM_mat_filter <-
apply(FPKM_mat, 1, function(x)
length(which(x >= 2)))
#Filtrar genes con menor a 2 muestras con expresión mayor a 2
FPKM_mat <- FPKM_mat[which(FPKM_mat_filter >= 2), ]
# Let's take a LOOK
dim(FPKM_mat)
## [1] 15787 9
FPKM_mat[10000:10010, ]
# Let's see what genes share count_mat and FPKM_mat
length(intersect(rownames(count_mat),rownames(FPKM_mat)))
## [1] 14068
Conteos [28 vs 0 días]
count_mat_28v0 = count_mat[,-grep('d14_', colnames(count_mat))]
# Conteos
aux_classes = rep(0, times = ncol(count_mat_28v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d28_", x = colnames(count_mat_28v0))] = 1
aux_classes
## [1] 0 0 0 1 1
colnames(count_mat_28v0)
## [1] "d0_r1" "d0_r2" "d0_r4" "d28_r8" "d28_r14"
count_results = DESeq_func(matrix_c = count_mat_28v0, classes_c = aux_classes)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
count_results = count_results[order(count_results$pvalue),]
count_results
# HISTOGRAM
pvals = count_results["pvalue"]
hist(
pvals[, 1],
prob = TRUE,
col = "black",
border = "white",
xlab = "scores",
breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd = sd(pvals[, 1])
curve(
dnorm(x, mean = score_mean, sd = score_sd),
add = TRUE,
col = "red",
lwd = 2
)

# Let's take a look to some genes
count_mat_28v0["ENSG00000128567.16_PODXL",]
count_mat_28v0["ENSG00000185559.14_DLK1",]
Conteos [14 vs 0 días]
count_mat_14v0 = count_mat[,-grep('d28_', colnames(count_mat))]
# Conteos
aux_classes = rep(0, times = ncol(count_mat_14v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d14_", x = colnames(count_mat_14v0))] = 1
aux_classes
## [1] 0 0 0 1 1 1 1
colnames(count_mat_14v0)
## [1] "d0_r1" "d0_r2" "d0_r4" "d14_r7" "d14_r8" "d14_r9" "d14_r10"
count_results = DESeq_func(matrix_c = count_mat_14v0, classes_c = aux_classes)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
count_results = count_results[order(count_results$pvalue),]
count_results
# HISTOGRAM
pvals = count_results["pvalue"]
hist(
pvals[, 1],
prob = TRUE,
col = "black",
border = "white",
xlab = "scores",
breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd = sd(pvals[, 1])
curve(
dnorm(x, mean = score_mean, sd = score_sd),
add = TRUE,
col = "red",
lwd = 2
)

# Let's take a look to some genes
count_mat_14v0["ENSG00000265992.1_ESRG",]
count_mat_14v0["ENSG00000185559.14_DLK1",]
FPKM [28 vs 0 días]
FPKM_mat_28v0 = FPKM_mat[,-grep('d14_', colnames(FPKM_mat))]
# FPKM
aux_classes = rep(0, times = ncol(FPKM_mat_28v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d28_", x = colnames(FPKM_mat_28v0))] = 1
aux_classes
## [1] 0 0 0 1 1
colnames(FPKM_mat_28v0)
## [1] "d0_r1" "d0_r2" "d0_r4" "d28_r8" "d28_r14"
fpkm_results =
limma4DS_fdr(
matrix_e = FPKM_mat_28v0,
classes_e = aux_classes,
classes_names = c("d0_", "d28_")
)
## toptable() is deprecated and will be removed in the future version of limma. Please use topTable() instead.
fpkm_results = fpkm_results[order(fpkm_results$p.value),]
fpkm_results
# HISTOGRAM
pvals = fpkm_results["p.value"]
hist(
pvals[, 1],
prob = TRUE,
col = "black",
border = "white",
xlab = "scores",
breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd = sd(pvals[, 1])
curve(
dnorm(x, mean = score_mean, sd = score_sd),
add = TRUE,
col = "red",
lwd = 2
)

# Let's take a look to some genes
FPKM_mat_28v0["ENSG00000261780.2_LINC02582",]
FPKM_mat_28v0["ENSG00000197406.7_DIO3",]
FPKM [14 vs 0 días]
FPKM_mat_14v0 = FPKM_mat[,-grep('d28_', colnames(FPKM_mat))]
# FPKM
aux_classes = rep(0, times = ncol(FPKM_mat_14v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d14_", x = colnames(FPKM_mat_14v0))] = 1
aux_classes
## [1] 0 0 0 1 1 1 1
colnames(FPKM_mat_14v0)
## [1] "d0_r1" "d0_r2" "d0_r4" "d14_r7" "d14_r8" "d14_r9" "d14_r10"
fpkm_results =
limma4DS_fdr(
matrix_e = FPKM_mat_14v0,
classes_e = aux_classes,
classes_names = c("d0_", "d14_")
)
## toptable() is deprecated and will be removed in the future version of limma. Please use topTable() instead.
fpkm_results = fpkm_results[order(fpkm_results$p.value),]
fpkm_results
# HISTOGRAM
pvals = fpkm_results["p.value"]
hist(
pvals[, 1],
prob = TRUE,
col = "black",
border = "white",
xlab = "scores",
breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd = sd(pvals[, 1])
curve(
dnorm(x, mean = score_mean, sd = score_sd),
add = TRUE,
col = "red",
lwd = 2
)

# Let's take a look to some genes
FPKM_mat_14v0["ENSG00000253507.5_AC104257.1",]
FPKM_mat_14v0["ENSG00000185559.14_DLK1",]
# Enable Warning messages
options(warn = 0)